0. Data used:¶
- Transfermarkt: https://www.kaggle.com/datasets/davidcariboo/player-scores
1. Data Loading¶
In [107]:
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
In [108]:
import plotly, plotly.io as pio
from kaleido.scopes.plotly import PlotlyScope
pio.kaleido.scope = PlotlyScope()
pio.renderers.default = "notebook"
In [109]:
# Define data path
data_path = "../data"
# Load data
try:
df_tm_clubs = pd.read_csv(f"{data_path}/tm_clubs.csv")
df_tm_transfers = pd.read_csv(f"{data_path}/tm_transfers.csv")
df_tm_competitions = pd.read_csv(f"{data_path}/tm_competitions.csv")
print("Files loaded successfully")
except FileNotFoundError:
print(f"Error: One or more files at {data_path} were not found.")
exit()
Files loaded successfully
2. Data Cleaning¶
2.1. Data transformation¶
In [110]:
# Modification of trasnfer_season column to preserve order
def expand_season(season_str):
start_year = int(season_str[:2])
end_year = int(season_str[3:])
if start_year < 70: # Consider that are 20xx
start_full = 2000 + start_year
end_full = 2000 + end_year
else: # Consider that are 19xx
start_full = 1900 + start_year
end_full = 1900 + end_year
return f"{start_full}/{str(end_full)[-2:]}"
df_tm_transfers['transfer_season'] = df_tm_transfers['transfer_season'].apply(expand_season)
In [111]:
# Convert transfer_date into datetime type
df_tm_transfers['transfer_date'] = pd.to_datetime(df_tm_transfers['transfer_date'], errors='coerce')
# Remove data from 2025 onward
df_tm_transfers = df_tm_transfers[
df_tm_transfers['transfer_date'] <= pd.Timestamp('2024-12-31')
].copy()
In [112]:
# Still remain noisy data from 2025 onward
df_tm_transfers[
df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) >= 2025
]
Out[112]:
| player_id | transfer_date | transfer_season | from_club_id | to_club_id | from_club_name | to_club_name | transfer_fee | market_value_in_eur | player_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5950 | 1027067 | 2024-07-01 | 2025/26 | 67278 | 67279 | St. Johnst. U18 | St. Johnst. B | NaN | NaN | Bayley Klimionek |
| 6754 | 551752 | 2024-06-30 | 2025/26 | 8970 | 416 | Frosinone | Torino | 0.0 | 2000000.0 | Demba Seck |
In [113]:
# Complete the cleaning process
df_tm_transfers = df_tm_transfers[
df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) <= 2024
].copy()
df_tm_transfers[
df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) >= 2025
]
Out[113]:
| player_id | transfer_date | transfer_season | from_club_id | to_club_id | from_club_name | to_club_name | transfer_fee | market_value_in_eur | player_name |
|---|
2.2. Cleaning Missing Values¶
In [114]:
# Check missing values in the dataset
missing_values = df_tm_transfers.isnull().sum()
# Display the missing values for each column
missing_values
Out[114]:
player_id 0 transfer_date 0 transfer_season 0 from_club_id 0 to_club_id 0 from_club_name 0 to_club_name 0 transfer_fee 27512 market_value_in_eur 30211 player_name 0 dtype: int64
In [115]:
# Check how many rows have missing values in both columns, transfer_fee and market_value_in_eur
missing_both_values = df_tm_transfers[
df_tm_transfers['transfer_fee'].isnull() & df_tm_transfers['market_value_in_eur'].isnull()
]
# Show the result
len(missing_both_values)
Out[115]:
19383
In [116]:
# Check missing values by year
missing_by_year = (
df_tm_transfers
.groupby(df_tm_transfers['transfer_date'].dt.year)[['transfer_fee', 'market_value_in_eur']]
.apply(lambda group: pd.Series({
'missing_transfer_fee': group['transfer_fee'].isnull().sum(),
'missing_market_value': group['market_value_in_eur'].isnull().sum(),
'missing_both_values': (
group['transfer_fee'].isnull() &
group['market_value_in_eur'].isnull()
).sum()
}))
.reset_index()
.rename(columns={'transfer_date': 'year'})
)
missing_by_year
Out[116]:
| year | missing_transfer_fee | missing_market_value | missing_both_values | |
|---|---|---|---|---|
| 0 | 1993 | 0 | 1 | 0 |
| 1 | 1994 | 2 | 3 | 2 |
| 2 | 1995 | 0 | 1 | 0 |
| 3 | 1996 | 2 | 3 | 2 |
| 4 | 1997 | 4 | 6 | 4 |
| 5 | 1998 | 6 | 16 | 6 |
| 6 | 1999 | 17 | 22 | 17 |
| 7 | 2000 | 17 | 30 | 17 |
| 8 | 2001 | 30 | 69 | 30 |
| 9 | 2002 | 42 | 77 | 42 |
| 10 | 2003 | 71 | 122 | 71 |
| 11 | 2004 | 120 | 199 | 120 |
| 12 | 2005 | 160 | 282 | 154 |
| 13 | 2006 | 244 | 382 | 234 |
| 14 | 2007 | 356 | 542 | 336 |
| 15 | 2008 | 452 | 613 | 406 |
| 16 | 2009 | 586 | 798 | 518 |
| 17 | 2010 | 717 | 990 | 625 |
| 18 | 2011 | 870 | 1220 | 736 |
| 19 | 2012 | 1058 | 1417 | 894 |
| 20 | 2013 | 1197 | 1587 | 986 |
| 21 | 2014 | 1328 | 1748 | 1075 |
| 22 | 2015 | 1545 | 1951 | 1234 |
| 23 | 2016 | 1624 | 2006 | 1294 |
| 24 | 2017 | 1908 | 2167 | 1414 |
| 25 | 2018 | 2072 | 2377 | 1534 |
| 26 | 2019 | 2189 | 2492 | 1587 |
| 27 | 2020 | 2454 | 2629 | 1801 |
| 28 | 2021 | 2323 | 2231 | 1509 |
| 29 | 2022 | 2320 | 1990 | 1367 |
| 30 | 2023 | 1934 | 1450 | 932 |
| 31 | 2024 | 1864 | 790 | 436 |
2.3. Missing values per season¶
In [117]:
# Create column of date for the continuous X axis
missing_by_year['year_date'] = pd.to_datetime(missing_by_year['year'].astype(str) + '-01-01')
# Convert to long format for plotly
df_long = missing_by_year.melt(
id_vars=['year', 'year_date'],
value_vars=['missing_transfer_fee', 'missing_market_value', 'missing_both_values'],
var_name='metric',
value_name='missing_count'
)
# Rename metrics for the legend
name_map = {
'missing_transfer_fee': 'Missing Transfer Fee',
'missing_market_value': 'Missing Market Value',
'missing_both_values': 'Missing Both Values'
}
df_long['metric'] = df_long['metric'].map(name_map)
# Create figure
fig = px.line(
df_long.sort_values('year_date'),
x='year_date',
y='missing_count',
color='metric',
markers=True,
hover_data={'year': True, 'year_date': False}
)
# Layout
ordered = missing_by_year.sort_values('year_date')
fig.update_layout(
title='Missing Values by Transfer Year',
xaxis_title='Year',
yaxis_title='Number of Missing Values',
legend_title='Metric',
template='plotly_dark',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='array',
tickvals=ordered['year_date'],
ticktext=ordered['year'].astype(str),
tickangle=55
)
)
fig.show()
3. EDA (Exploratory Data Analysis)¶
3.1. Create the Cleaned Dataset by Removing Rows with Missing Values¶
In [118]:
# Create a new dataframe with the cleaned data
df_tm_transfers_cleaned = df_tm_transfers.dropna(
subset=['transfer_fee', 'market_value_in_eur'],
how='any'
).copy()
# Show the dataset information that have been recently cleaned
print(f"Dataset Shape: {df_tm_transfers_cleaned.shape}")
print(f"\nColumn Data Types:\n{df_tm_transfers_cleaned.dtypes}")
print(f"\nMissing Values:\n{df_tm_transfers_cleaned.isnull().sum()}")
Dataset Shape: (39378, 10) Column Data Types: player_id int64 transfer_date datetime64[ns] transfer_season object from_club_id int64 to_club_id int64 from_club_name object to_club_name object transfer_fee float64 market_value_in_eur float64 player_name object dtype: object Missing Values: player_id 0 transfer_date 0 transfer_season 0 from_club_id 0 to_club_id 0 from_club_name 0 to_club_name 0 transfer_fee 0 market_value_in_eur 0 player_name 0 dtype: int64
In [119]:
# Check missing values in the dataset
missing_values = df_tm_transfers_cleaned.isnull().sum()
# Display the missing values for each column
missing_values
Out[119]:
player_id 0 transfer_date 0 transfer_season 0 from_club_id 0 to_club_id 0 from_club_name 0 to_club_name 0 transfer_fee 0 market_value_in_eur 0 player_name 0 dtype: int64
3.2. Number of Transfers per Year (Cleaned Dataset)¶
In [120]:
# Count by year
counts = (
df_tm_transfers_cleaned['transfer_date']
.dt.year
.value_counts()
.rename_axis('year')
.sort_index()
.reset_index(name='n_transfers')
)
# 3-year rolling average
counts['roll3'] = counts['n_transfers'].rolling(3, center=True, min_periods=2).mean()
# Interactive plot
fig = px.bar(
counts,
x='year',
y='n_transfers',
color='n_transfers',
text='n_transfers',
title='Total Number of Transfers per Year',
hover_data={'year': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)
# Add trend line
fig.add_trace(go.Scatter(
x=counts['year'],
y=counts['roll3'],
mode='lines+markers',
name='3Y Rolling Avg',
hovertemplate='Year: %{x}<br>3Y Rolling Avg: %{y:.0f}<extra></extra>'
))
# Style
fig.update_layout(
legend=dict(
orientation='v', # vertical
yanchor='top',
y=1,
xanchor='left',
x=0
),
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Number of Transfers',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='array',
tickvals=counts['year'],
ticktext=counts['year'],
tickangle=55,
rangeslider=dict(visible=False),
rangeselector=dict(visible=False)
),
coloraxis_colorbar=dict(title='Transfers')
)
fig.show()
3.3. Numeric Distribution of Transfer Fees¶
In [121]:
# Prepare data (only valid transfer_fee)
series = pd.to_numeric(df_tm_transfers_cleaned['transfer_fee'], errors='coerce').dropna()
if series.empty:
raise ValueError("There are no valid values in 'transfer_fee'.")
# Statistics
mean_val = series.mean()
median_val = series.median()
# Manual binning to color by intensity
nbins = 40
counts, bin_edges = np.histogram(series, bins=nbins)
bin_left = bin_edges[:-1]
bin_right = bin_edges[1:]
bin_center = (bin_left + bin_right) / 2
df_bins = pd.DataFrame({
'bin_left': bin_left,
'bin_right': bin_right,
'bin_center': bin_center,
'count': counts
})
# Interactive plot
fig = px.bar(
df_bins,
x='bin_center',
y='count',
color='count',
text='count',
title='Distribution of Transfer Fees (Log Scale)',
labels={'bin_center': 'Transfer Fee (€)', 'count': 'Count'},
hover_data={'bin_left': ':.0f', 'bin_right': ':.0f', 'bin_center': ':.0f', 'count': ':.0f'}
)
fig.update_traces(textposition='outside', cliponaxis=False)
# Add peak annotation
idx_max = df_bins['count'].idxmax()
fig.add_annotation(
x=df_bins.loc[idx_max, 'bin_center'],
y=df_bins.loc[idx_max, 'count'],
text=f"Peak<br>{int(df_bins.loc[idx_max, 'count']):,}",
showarrow=True,
arrowhead=2,
yshift=10
)
# Style
fig.update_layout(
template='plotly_dark',
xaxis_title='Transfer Fee (€)',
yaxis_title='Count',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
legend=dict(
orientation='v',
yanchor='top', y=1,
xanchor='left', x=0
)
)
# X axis with more compact format
fig.update_xaxes(tickformat=',.0f')
fig.update_yaxes(tickformat=',.0f')
fig.update_yaxes(type='log')
fig.show()
3.3.1. Removing Free Transfers¶
In [122]:
# Filter: only valid transfer_fee and > 0 (exclude free)
series_all = pd.to_numeric(df_tm_transfers_cleaned['transfer_fee'], errors='coerce').dropna()
series = series_all[series_all > 0]
if series.empty:
raise ValueError("There are no valid values in 'transfer_fee' > 0 (excluding free).")
n_free = (series_all == 0).sum()
# Statistics (excluding free)
mean_val = series.mean()
median_val = series.median()
# 3) Bineado manual para colorear por intensidad
nbins = 40
counts, bin_edges = np.histogram(series, bins=nbins)
bin_left = bin_edges[:-1]
bin_right = bin_edges[1:]
bin_center = (bin_left + bin_right) / 2
df_bins = pd.DataFrame({
'bin_left': bin_left,
'bin_right': bin_right,
'bin_center': bin_center,
'count': counts
})
# Interactive plot
fig = px.bar(
df_bins,
x='bin_center',
y='count',
color='count',
text='count',
title='Distribution of Transfer Fees - Excluding Free Transfers (Log Scale)',
labels={'bin_center': 'Transfer Fee (€)', 'count': 'Count'},
hover_data={'bin_left': ':.0f', 'bin_right': ':.0f', 'bin_center': ':.0f', 'count': ':.0f'}
)
fig.update_traces(textposition='outside', cliponaxis=False)
# Style
fig.update_layout(
template='plotly_dark',
xaxis_title='Transfer Fee (€)',
yaxis_title='Count',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
legend=dict(orientation='v', yanchor='top', y=1, xanchor='left', x=0)
)
fig.update_xaxes(tickformat=',.0f')
fig.update_yaxes(tickformat=',.0f')
fig.update_yaxes(type='log')
# Add note with number of free excluded
if n_free > 0:
fig.add_annotation(
x=1, y=1.12, xref='paper', yref='paper',
text=f"Free transfers excluded: {n_free:,}",
showarrow=False, align='right'
)
fig.show()
3.4. Correlation between Market Value and Transfer Fees¶
In [123]:
# Filter and group by year
df_yearly = (
df_tm_transfers_cleaned
.loc[
(df_tm_transfers_cleaned['transfer_fee'] > 0)
]
.dropna(subset=['transfer_fee', 'market_value_in_eur', 'transfer_date'])
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2008]
.groupby('year', as_index=False)
.agg({
'market_value_in_eur': 'mean',
'transfer_fee': 'mean'
})
)
# Create figure with two lines
fig = go.Figure()
fig.add_trace(go.Scatter(
x=df_yearly['year'],
y=df_yearly['market_value_in_eur'],
mode='lines+markers',
name='Avg Market Value',
line=dict(color='royalblue', width=2)
))
fig.add_trace(go.Scatter(
x=df_yearly['year'],
y=df_yearly['transfer_fee'],
mode='lines+markers',
name='Avg Transfer Fee',
line=dict(color='orange', width=2)
))
# Style the graph
fig.update_layout(
title='Average Market Value vs Transfer Fee by Year (Excluding Free Transfers)',
xaxis_title='Year',
yaxis_title='Average Value (€)',
template='plotly_dark',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='linear',
dtick=1,
tickangle=55
)
)
fig.show()
3.5. Evalution of Average Spend by Transfer by Year¶
In [124]:
df_f = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_fee', 'transfer_date'])
.loc[df_tm_transfers_cleaned['transfer_fee'] > 0]
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2008]
)
# Grouping by year
yearly = (
df_f.groupby('year', as_index=False)
.agg(
avg_fee=('transfer_fee', 'mean'),
n_transfers=('transfer_fee', 'size')
)
.sort_values('year')
)
# Optional: 3-year trailing average for the line
yearly['avg_fee_roll3'] = yearly['avg_fee'].rolling(3, center=False, min_periods=1).mean()
# Figure with double axis: line (avg) + bars (count)
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Bars: number of transfers (secondary axis)
fig.add_trace(
go.Bar(
x=yearly['year'],
y=yearly['n_transfers'],
name='Transfers',
opacity=0.35,
hovertemplate='Year: %{x}<br>Transfers: %{y}<extra></extra>'
),
secondary_y=True
)
# Main line: average transfer fee
fig.add_trace(
go.Scatter(
x=yearly['year'],
y=yearly['avg_fee'],
mode='lines+markers',
name='Avg Transfer Fee',
line=dict(width=3),
marker=dict(size=7),
hovertemplate='Year: %{x}<br>Avg Fee: €%{y:,.0f}<extra></extra>'
),
secondary_y=False
)
# Optional: smoothed line (3Y)
fig.add_trace(
go.Scatter(
x=yearly['year'],
y=yearly['avg_fee_roll3'],
mode='lines',
name='Avg Fee (3Y MA)',
line=dict(dash='dash'),
hovertemplate='Year: %{x}<br>3Y MA: €%{y:,.0f}<extra></extra>'
),
secondary_y=False
)
# Peak annotation
idx_max = yearly['avg_fee'].idxmax()
fig.add_annotation(
x=yearly.loc[idx_max, 'year'],
y=yearly.loc[idx_max, 'avg_fee'],
text=f"Peak €{yearly.loc[idx_max, 'avg_fee']:,.0f}",
showarrow=True,
arrowhead=2,
yshift=10
)
# Style the graph
fig.update_layout(
title='Average Transfer Fee per Year (Excluding Free Transfers)',
template='plotly_dark',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
legend=dict(orientation='v', yanchor='top', y=1, xanchor='left', x=0),
xaxis=dict(
title='Year',
tickmode='linear',
dtick=1,
tickangle=55
),
)
fig.update_yaxes(
title_text='Avg Transfer Fee (€)',
secondary_y=False
)
fig.update_yaxes(
title_text='Transfers',
secondary_y=True
)
fig.show()
3.6. Number of Transfers of more than 10 mill. € by Year¶
In [125]:
# Filter transfers over 10M from 2008
df_over_10m = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_fee', 'transfer_date'])
.loc[df_tm_transfers_cleaned['transfer_fee'] > 10_000_000]
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2008]
)
# Count by year
transfers_over_10m_by_year = (
df_over_10m.groupby('year', as_index=False)
.agg(n_transfers=('transfer_fee', 'size'))
.sort_values('year')
)
# Calculate 3-year rolling average
transfers_over_10m_by_year['roll3'] = (
transfers_over_10m_by_year['n_transfers']
.rolling(3, center=True, min_periods=2)
.mean()
)
# Bar graph with continuous colors
fig = px.bar(
transfers_over_10m_by_year,
x='year',
y='n_transfers',
color='n_transfers', # <- color continuo según valor
text='n_transfers',
title='Number of Transfers > €10 mill. by Year',
hover_data={'year': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)
# Add trend line
fig.add_trace(go.Scatter(
x=transfers_over_10m_by_year['year'],
y=transfers_over_10m_by_year['roll3'],
mode='lines+markers',
name='3Y Rolling Avg',
hovertemplate='Year: %{x}<br>3Y Rolling Avg: %{y:.0f}<extra></extra>'
))
# Style the graph
fig.update_layout(
legend=dict(
orientation='v',
yanchor='top',
y=1,
xanchor='left',
x=0
),
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Number of Transfers > €10 mill.',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='array',
tickvals=transfers_over_10m_by_year['year'],
ticktext=transfers_over_10m_by_year['year'],
tickangle=55,
rangeslider=dict(visible=False),
rangeselector=dict(visible=False)
),
coloraxis_colorbar=dict(title='Transfers')
)
fig.show()
3.7. Market Growth over the last 10 Years¶
In [126]:
# Base: year, cleaning and aggregations
df_base = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_date'])
.assign(year=lambda d: d['transfer_date'].dt.year)
)
# Count of transfers
counts = (
df_base
.groupby('year', as_index=True)
.size()
.rename('n_transfers')
.sort_index()
)
# Money moved
money = (
df_base
.dropna(subset=['transfer_fee'])
.groupby('year', as_index=True)['transfer_fee']
.sum()
.rename('total_fee')
.sort_index()
)
# Ensure continuity of years (fill with 0)
all_years = pd.Index(range(min(counts.index.min(), money.index.min()),
max(counts.index.max(), money.index.max())+1), name='year')
counts = counts.reindex(all_years, fill_value=0)
money = money.reindex(all_years, fill_value=0)
# Window: last 10 years available
end_year = int(all_years.max())
start_year = max(int(all_years.min()), end_year - 9)
counts_10 = counts.loc[start_year:end_year]
money_10 = money.loc[start_year:end_year]
years_diff = end_year - start_year # number of years between extremes (for CAGR)
def safe_cagr(end_val, start_val, years):
if years <= 0:
return np.nan
if start_val <= 0:
return np.nan # CAGR not defined if the start is 0 or negative
return (end_val / start_val) ** (1/years) - 1
cagr_counts = safe_cagr(counts_10.iloc[-1], counts_10.iloc[0], years_diff)
cagr_money = safe_cagr(money_10.iloc[-1], money_10.iloc[0], years_diff)
# Graph 1: Number of transfers and CAGR
fig1 = px.bar(
counts_10.reset_index(),
x='year',
y='n_transfers',
color='n_transfers',
text='n_transfers',
title=f'Number of Transfers (Including Free Transfers) per Year — CAGR {("" if pd.notna(cagr_counts) else "N/A") if pd.isna(cagr_counts) else f"{cagr_counts*100:.1f}%"} ({start_year}–{end_year})',
hover_data={'year': True}
)
fig1.update_traces(textposition='outside', cliponaxis=False)
fig1.update_layout(
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Transfers',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(tickmode='array',
tickvals=counts_10.index,
ticktext=counts_10.index.astype(str),
tickangle=55),
coloraxis_colorbar=dict(title='Transfers')
)
fig1.show()
# Graph 2: Money moved and CAGR
fig2 = px.bar(
money_10.reset_index(),
x='year',
y='total_fee',
color='total_fee',
text=money_10.reset_index()['total_fee'].map(lambda v: f"€{v:,.0f}"),
title=f'Total Money Moved per Year — CAGR {("" if pd.notna(cagr_money) else "N/A") if pd.isna(cagr_money) else f"{cagr_money*100:.1f}%"} ({start_year}–{end_year})',
hover_data={'year': True}
)
fig2.update_traces(textposition='outside', cliponaxis=False)
fig2.update_layout(
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Aggregated Fees',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(tickmode='array',
tickvals=money_10.index,
ticktext=money_10.index.astype(str),
tickangle=55),
coloraxis_colorbar=dict(title='Fees')
)
fig2.show()
4. Club Analysis¶
4.1. Average Spending by Club¶
In [127]:
# Filtering
df_club_spending = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_fee', 'transfer_date', 'to_club_id'])
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2008]
)
# Total spending per club and year
club_year_spending = (
df_club_spending
.groupby(['year', 'to_club_id'], as_index=False)
.agg(total_spent=('transfer_fee', 'sum'))
)
# Average spending per club per year
avg_spending_per_club = (
club_year_spending
.groupby('year', as_index=False)
.agg(avg_spent_per_club=('total_spent', 'mean'))
)
# Calculate CAGR
start_year = int(avg_spending_per_club['year'].min())
end_year = int(avg_spending_per_club['year'].max())
years_diff = end_year - start_year
start_val = avg_spending_per_club.loc[avg_spending_per_club['year'] == start_year, 'avg_spent_per_club'].iloc[0]
end_val = avg_spending_per_club.loc[avg_spending_per_club['year'] == end_year, 'avg_spent_per_club'].iloc[0]
if start_val > 0:
cagr = (end_val / start_val) ** (1 / years_diff) - 1
else:
cagr = np.nan
# Interactive graph
fig = px.bar(
avg_spending_per_club,
x='year',
y='avg_spent_per_club',
color='avg_spent_per_club',
text=avg_spending_per_club['avg_spent_per_club'].map(lambda v: f"€{v:,.0f}"),
title=f'Average Spending per Club by Year — CAGR {cagr*100:.1f}%',
hover_data={'year': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)
fig.update_layout(
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Average Spending per Club',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='array',
tickvals=avg_spending_per_club['year'],
ticktext=avg_spending_per_club['year'],
tickangle=55
),
coloraxis_colorbar=dict(title='Spending (€)')
)
fig.show()
4.2. Percentage of Profitable Clubs by Season¶
In [128]:
# Base: year, cleaning and aggregations
df_base = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_fee', 'transfer_date'])
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2008]
)
# Income by club-year (club as seller)
income_by_club_year = (
df_base
.groupby(['year', 'from_club_id'], as_index=False)
.agg(total_income=('transfer_fee', 'sum'))
.rename(columns={'from_club_id': 'club_id'})
)
# Spend by club-year (club as buyer)
spend_by_club_year = (
df_base
.groupby(['year', 'to_club_id'], as_index=False)
.agg(total_spent=('transfer_fee', 'sum'))
.rename(columns={'to_club_id': 'club_id'})
)
# Join income and spend
club_balance = (
pd.merge(income_by_club_year, spend_by_club_year,
on=['year', 'club_id'], how='outer')
.fillna(0)
)
# Flag of profitable
club_balance['profitable'] = club_balance['total_income'] > club_balance['total_spent']
# Percentage of profitable clubs per year
profitability_by_year = (
club_balance
.groupby('year', as_index=False)
.agg(
pct_profitable=('profitable', lambda x: 100 * x.mean()),
n_clubs=('club_id', 'nunique')
)
.sort_values('year')
)
# Graph
fig = px.bar(
profitability_by_year,
x='year',
y='pct_profitable',
color='pct_profitable',
text=profitability_by_year['pct_profitable'].map(lambda v: f"{v:.1f}%"),
title='Percentage of Profitable Clubs by Year',
hover_data={'year': True, 'n_clubs': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)
fig.update_layout(
template='plotly_dark',
xaxis_title='Year',
yaxis_title='Profitable Clubs',
hovermode='x unified',
margin=dict(l=40, r=20, t=60, b=80),
xaxis=dict(
tickmode='array',
tickvals=profitability_by_year['year'],
ticktext=profitability_by_year['year'],
tickangle=55
),
coloraxis_colorbar=dict(title='% Profitable')
)
fig.show()
4.3. Percentage of Profitable Clubs of the last Decade¶
In [129]:
# Base: year, cleaning and aggregations
df_base = (
df_tm_transfers_cleaned
.dropna(subset=['transfer_fee', 'transfer_date'])
.assign(year=lambda d: d['transfer_date'].dt.year)
.loc[lambda d: d['year'] >= 2015]
)
# Total income by club
income_by_club = (
df_base
.groupby('from_club_id', as_index=False)
.agg(total_income=('transfer_fee', 'sum'))
.rename(columns={'from_club_id': 'club_id'})
)
# Total spend by club
spend_by_club = (
df_base
.groupby('to_club_id', as_index=False)
.agg(total_spent=('transfer_fee', 'sum'))
.rename(columns={'to_club_id': 'club_id'})
)
# Join income and spend
club_balance_total = (
pd.merge(income_by_club, spend_by_club,
on='club_id', how='outer')
.fillna(0)
)
# Flag of profitable
club_balance_total['profitable'] = club_balance_total['total_income'] > club_balance_total['total_spent']
# Data for donut
values = [
club_balance_total['profitable'].sum(),
(~club_balance_total['profitable']).sum()
]
labels = ['Profitable', 'Not Profitable']
# Donut graph with custom colors
fig = px.pie(
names=labels,
values=values,
title='Percentage of Profitable Clubs of the last Decade (2015-2024)',
hole=0.5,
color=labels,
color_discrete_map={
'Profitable': '#f48c45',
'Not Profitable': '#5202a2'
}
)
# Style and text
fig.update_traces(
textinfo='label+percent',
textposition='inside',
insidetextorientation='horizontal',
insidetextfont=dict(color='white'),
outsidetextfont=dict(color='white'),
texttemplate='%{label}<br>%{percent}',
marker=dict(line=dict(color='#000000', width=2))
)
fig.update_layout(
template='plotly_dark',
legend=dict(
orientation='h',
yanchor='bottom',
y=-0.15,
xanchor='center',
x=0.5
),
margin=dict(l=40, r=40, t=60, b=40)
)
fig.show()